library(tidyverse) # for general analysis
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(fpp)
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: tseries
library(fpp3) # for predictions
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.3 ──
## ✓ lubridate 1.7.4 ✓ feasts 0.1.5
## ✓ tsibble 0.9.3 ✓ fable 0.2.1
## ✓ tsibbledata 0.2.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x fabletools::forecast() masks forecast::forecast()
## x tsibble::index() masks zoo::index()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag() masks stats::lag()
## x tsibble::new_interval() masks lubridate::new_interval()
library(ggthemes) # for beautiful themes
library(TSA) # for TS analysis
## Registered S3 methods overwritten by 'TSA':
## method from
## fitted.Arima forecast
## plot.Arima forecast
##
## Attaching package: 'TSA'
## The following object is masked from 'package:readr':
##
## spec
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(kableExtra) # for beautiful tables
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(lubridate) # for time data
library(tsibble) # for tsiblle data format
library(caret) # for modeling
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
##
## MAE, RMSE
## The following object is masked from 'package:purrr':
##
## lift
library(stats)
library(ggplot2)
library(MuMIn)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:fabletools':
##
## accuracy
## The following object is masked from 'package:forecast':
##
## accuracy
library(tstools)
# Set a theme
theme_set(theme_minimal()) # apply ggplot2's minimal theme to every subsequent plot
Data is imported and transformed into a time series object.
# Import Zillow monthly median home sale prices and build a monthly ts object
# for the Chicago metro series.
sales_month <- read_csv("Zillow monthly raw data (full).csv") #import raw data file
## Parsed with column specification:
## cols(
## .default = col_double(),
## RegionName = col_character(),
## RegionType = col_character(),
## StateName = col_character()
## )
## See spec(...) for full column specifications.
chicago_sales_m <- sales_month %>% filter(RegionName == "Chicago, IL") #import Chicago data only
chicago_sales_m <- chicago_sales_m %>% #select median price and date only
  # pivot_longer() replaces the superseded gather(); every column except the
  # region metadata (RegionID through StateName) is a month column. With a
  # single Chicago row the resulting row order is identical to gather()'s.
  pivot_longer(cols = -(RegionID:StateName), names_to = "Date", values_to = "Price") %>%
  select(Date, Price) %>%
  rename(Median_price = Price)
chicago_sales_m$Date <- mdy(chicago_sales_m$Date) # parse month/day/year strings into Dates
dim(chicago_sales_m)
## [1] 151 2
# 151 monthly observations starting February 2008.
chi_median <- ts(chicago_sales_m$Median_price, start=c(2008,2), frequency=12)
Data is complete, and in appropriate form.
# Sanity-check the ts object: no missing values, monthly frequency,
# and a well-formed cycle structure (starts in month 2 = Feb 2008).
sum(is.na(chi_median)) #check for null values
## [1] 0
frequency(chi_median) #check correct frequency of time series
## [1] 12
cycle(chi_median) #check appropriate structure of time series
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3 4 5 6 7 8 9 10 11 12
## 2019 1 2 3 4 5 6 7 8 9 10 11 12
## 2020 1 2 3 4 5 6 7 8
There are two distinctive trends - a negative trend during and in the wake of the 2008 financial crisis (2008-2013), followed by a positive trend until the present. There is also clear additive seasonality. However, this seasonal trend has been impacted by the current COVID-19 pandemic.
# Plot the full series: downtrend 2008-2013, uptrend thereafter, clear
# annual seasonality.
autoplot(chi_median, main="Chicago: Median Home Sale Price")
min(chi_median) #$149,000, which corresponds to February 2013
## [1] 149000
max(chi_median) #$270,000, which corresponds to August 2020
## [1] 270000
Decomposition of the time series validates the initial impression of the data. However, there appears to be more than one seasonal pattern, as evidenced by the “seasonal” and “random” decomposition components.
# Classical moving-average decomposition into trend/seasonal/random components.
plot(decompose(chi_median))
Median price peaks during the summer months. The range between median prices is also narrower in the spring/summer months than in the rest of the year.
# Price distribution by calendar month (1 = Jan, ..., 12 = Dec).
# NOTE(review): the x-axis shows the month within each yearly cycle, so the
# label "Cycle (Year)" is misleading -- confirm intended label.
boxplot(chi_median~cycle(chi_median), xlab="Cycle (Year)", ylab="Dollars", main="Chicago: Median Home Sales Price")
Both the ACF and PACF decay slowly and sinusoidally over time, indicating seasonality and a significant linear relationship between the series and its lags.
# ACF/PACF out to 50 lags; the slow sinusoidal decay indicates trend plus
# seasonality and strong linear dependence on past values.
acf(chi_median, main="ACF Chicago Median Home Sales", lag.max=50)
pacf(chi_median, main="PACF Chicago Median Home Sales", lag.max=50)
Looking at the raw periodogram, we can see several spikes and therefore several contributing frequencies. Two of these spikes dominate, and their frequencies are the most important to the overall signal.
# Frequency-domain view: smoothed spectrum (stats) and raw periodogram (TSA).
spectrum(chi_median)
periodogram(chi_median)
Create train/test split for modelling.
# Train/test split. Explicit c(year, month) endpoints replace the fragile
# fractional-year value 2019.6, which is not an actual monthly time point
# (Aug 2019 = 2019.583, Sep 2019 = 2019.667). The split itself is unchanged:
# training runs through August 2019; the test set is September 2019 onward.
chi_train <- window(chi_median, end = c(2019, 8))   #end train in August 2019
chi_test  <- window(chi_median, start = c(2019, 9)) #use last year of dataset as test
#double-check time series structure
frequency(chi_train)
## [1] 12
cycle(chi_train)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3 4 5 6 7 8 9 10 11 12
## 2019 1 2 3 4 5 6 7 8
frequency(chi_test)
## [1] 12
cycle(chi_test)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2019 9 10 11 12
## 2020 1 2 3 4 5 6 7 8
The Holt-Winters Seasonal Method was selected based on its appropriateness for time series with both a trend and seasonality. Two additive HW models were created - one with and one without damping.
The first model, which does not include damping, does not completely capture the autocorrelation in the time series. There remains significant autocorrelation in the residuals, and the residuals are not normally distributed.
# Holt-Winters additive model without damping; 12-month forecast horizon.
HW1 <- hw(chi_train, seasonal = "additive",h=12, damped = FALSE)
plot(HW1)
checkresiduals(HW1) # Ljung-Box p << 0.05: residuals remain autocorrelated
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' additive method
## Q* = 70.68, df = 8, p-value = 3.598e-12
##
## Model df: 16. Total lags used: 24
Adding damping does not improve the model. The residuals do not resemble white noise.
# Holt-Winters additive model WITH damping; residuals still fail Ljung-Box.
HW2 <- hw(chi_train, seasonal = "additive",h=12, damped = TRUE)
plot(HW2)
checkresiduals(HW2) # Ljung-Box p << 0.05: damping does not fix the residuals
##
## Ljung-Box test
##
## data: Residuals from Damped Holt-Winters' additive method
## Q* = 68.899, df = 7, p-value = 2.466e-12
##
## Model df: 17. Total lags used: 24
From the EDA, we know that there is significant seasonality in the data, suggesting that a sArima model might be appropriate. We begin exploring ways to make the data stationary.
We first check if a Box-Cox Transformation is necessary, and find that lambda should be set to 1.999 (rounded to 2 for simplicity). Once the Box-Cox Transformation is completed, a KPSS Test for stationarity is completed. The series is not stationary.
# Estimate a Box-Cox lambda, transform the training series, and test for
# level stationarity with KPSS (null hypothesis: stationary).
BoxCox.lambda(chi_train) # lambda = 1.999924, significantly different from 1
## [1] 1.999924
lambda <- 2 # round lambda to 2 for simplicity
transformed <- BoxCox(chi_train, lambda=lambda) # BoxCox Transformation
kpss.test(transformed) # p=0.0141 < 0.05: reject stationarity after transformation alone
##
## KPSS Test for Level Stationarity
##
## data: transformed
## KPSS Level = 0.69386, Truncation lag parameter = 4, p-value = 0.0141
The time series becomes stationary after 1st order differencing.
# First-order differencing of the transformed series achieves stationarity.
differenced <- diff(transformed) #difference transformed data
kpss.test(differenced) # KPSS test - p=0.1 > 0.05, data is stationary after Box-Cox transformation and 1 round of differencing
## Warning in kpss.test(differenced): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: differenced
## KPSS Level = 0.13632, Truncation lag parameter = 4, p-value = 0.1
tsdisplay(differenced, lag=50) #visualize new dataset
periodogram(differenced) # NOTE(review): periodograms are usually read on the raw series -- confirm this differenced view is intended
A sArima model is built using auto.arima, with differencing=1 and lambda=2. This results in an ARIMA(2,1,2)(0,1,2)[12] model. The Ljung-Box test shows that there is not significant autocorrelation remaining in the residuals - they resemble white noise.
# Let auto.arima select the orders, forcing d=1 and lambda=2 per the
# stationarity analysis above. Selected model: ARIMA(2,1,2)(0,1,2)[12].
AR1 <- auto.arima(chi_train, d=1, lambda=lambda)
summary(AR1)
## Series: chi_train
## ARIMA(2,1,2)(0,1,2)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## -0.1436 -0.3785 -0.1495 0.6499 -0.2587 -0.2835
## s.e. 0.2826 0.1670 0.2313 0.1251 0.1151 0.1079
##
## sigma^2 estimated as 9.664e+17: log likelihood=-2786.77
## AIC=5587.54 AICc=5588.49 BIC=5607.39
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 910.0087 4697.268 3476.997 0.4844208 1.755815 0.2767997 -0.0248055
checkresiduals(AR1) # Ljung-Box p = 0.17 > 0.05: residuals resemble white noise
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(0,1,2)[12]
## Q* = 23.441, df = 18, p-value = 0.1742
##
## Model df: 6. Total lags used: 24
plot(forecast(AR1)) # forecast at the default horizon for a quick visual check
Further exploration is necessary to see if a superior sArima model can be built.
# Extended ACF table (TSA) to suggest candidate (p,q) orders for the
# manually specified models that follow.
eacf(chi_train)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x x x x
## 1 x x o o x x x o o x x x x x
## 2 x o x o o o x o o o o x o o
## 3 x o o o o o o o o o o x o o
## 4 o x x o o o o o o o o x o o
## 5 x o o x o o o o o o o x x x
## 6 x x o x x o o o o o o x x o
## 7 x x o x o o o o o o o x x x
We experiment with different combinations of p=1/2, q=1/2, and P=0/1. Of the models tested, only AR2 (Arima(2,1,2)(1,1,1)[12]) passes the Ljung-Box test (assuming 0.05 threshold).
# Candidate 1: ARIMA(2,1,2)(1,1,1)[12] -- the only manual model whose
# residuals pass the Ljung-Box test (p = 0.087 > 0.05).
AR2 <- Arima(chi_train, order=c(2,1,2), seasonal=c(1,1,1),lambda=lambda) #p=0.09
summary(AR2)
## Series: chi_train
## ARIMA(2,1,2)(1,1,1)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1
## -0.1324 -0.3694 -0.1473 0.6326 0.3773 -0.7245
## s.e. 0.3064 0.1669 0.2496 0.1421 0.1927 0.1599
##
## sigma^2 estimated as 9.937e+17: log likelihood=-2788.21
## AIC=5590.42 AICc=5591.37 BIC=5610.28
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 912.5512 4732.661 3506.983 0.4828152 1.769271 0.2791869
## ACF1
## Training set -0.02600148
checkresiduals(AR2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(1,1,1)[12]
## Q* = 26.573, df = 18, p-value = 0.08736
##
## Model df: 6. Total lags used: 24
# Candidate 2: ARIMA(1,1,1)(1,1,1)[12] -- fails Ljung-Box (p = 0.005).
AR3 <- Arima(chi_train, order=c(1,1,1), seasonal=c(1,1,1),lambda=lambda) #p=0.005
summary(AR3)
## Series: chi_train
## ARIMA(1,1,1)(1,1,1)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.4016 0.0957 0.320 -0.6747
## s.e. 0.1858 0.1938 0.204 0.1732
##
## sigma^2 estimated as 1.052e+18: log likelihood=-2792.51
## AIC=5595.01 AICc=5595.51 BIC=5609.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1060.193 4891.313 3731.734 0.5555307 1.88292 0.2970791
## ACF1
## Training set -0.006238075
checkresiduals(AR3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 40.2, df = 20, p-value = 0.004712
##
## Model df: 4. Total lags used: 24
# Candidate 3: ARIMA(1,1,2)(0,1,1)[12] -- fails Ljung-Box (p = 0.010).
AR4 <- Arima(chi_train, order=c(1,1,2), seasonal=c(0,1,1),lambda=lambda) #p=0.01
summary(AR4)
## Series: chi_train
## ARIMA(1,1,2)(0,1,1)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ma1 ma2 sma1
## -0.0480 -0.2462 0.2981 -0.4112
## s.e. 0.2072 0.1836 0.1230 0.1209
##
## sigma^2 estimated as 1.044e+18: log likelihood=-2791.84
## AIC=5593.69 AICc=5594.19 BIC=5607.87
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 785.2016 4810.3 3652.513 0.4181308 1.837683 0.2907724 -0.009892509
checkresiduals(AR4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)(0,1,1)[12]
## Q* = 37.689, df = 20, p-value = 0.009664
##
## Model df: 4. Total lags used: 24
# Candidate 4: ARIMA(2,1,1)(0,1,1)[12] -- fails Ljung-Box (p = 0.002).
AR5 <- Arima(chi_train, order=c(2,1,1), seasonal=c(0,1,1),lambda=lambda) #p=0.002
summary(AR5)
## Series: chi_train
## ARIMA(2,1,1)(0,1,1)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ar2 ma1 sma1
## 0.0366 0.1842 -0.3266 -0.4154
## s.e. 0.4093 0.1456 0.4079 0.1186
##
## sigma^2 estimated as 1.071e+18: log likelihood=-2793.38
## AIC=5596.77 AICc=5597.27 BIC=5610.95
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 880.3428 4880.438 3743.138 0.4651391 1.882288 0.2979869
## ACF1
## Training set -0.02164737
checkresiduals(AR5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,1)(0,1,1)[12]
## Q* = 43.317, df = 20, p-value = 0.001856
##
## Model df: 4. Total lags used: 24
Next, we compare the AICc and BIC scores of the two models that passed the Ljung-Box test. The model selected by auto.arima has lower AICc and BIC values, showing that it is the better choice.
# Compare AICc and BIC for the two models that passed the Ljung-Box test.
# Bug fix: the original read AR2$bi, which only returned the right value via
# R's partial matching of $ on list names; spell out AR2$bic explicitly.
AR_AB <- c(AR1$aicc, AR2$aicc, AR1$bic, AR2$bic)
names=c("AR1: AICc","AR2: AICc","AR1: BIC","AR2: BIC")
barplot(AR_AB, names=names, ylim=c(5585,5615), main="Arima Models", col='azure')
Next, we use the ARIMA(2,1,2)(0,1,2)[12] model to forecast results for the next year. The “Actual vs. Predicted” chart shows that the model does a decent job of forecasting values pre-COVID, but is not capable of accurately predicting the trend for the past few months.
#model 1
# Forecast the hold-out year with the auto.arima model and score it.
AR1_fcast <- forecast(AR1, h=12)
plot(AR1_fcast, chi_test)
tsplot(cbind(chi_test, AR1_fcast$mean), plot_title="Actual vs. Predicted")
MAE(AR1_fcast$mean, chi_test) # mean absolute error on the test year
## [1] 4971.849
RMSE(AR1_fcast$mean, chi_test) # root mean squared error on the test year
## [1] 7592.068
Next we do cross-validation to understand how the model’s performance evolves over time.
# Rolling-window cross-validation of the selected sArima specification.
k <- 72 # minimum observations (6 years)
n <- length(chi_median) # Number of data points
p <- 12 # Period
H <- 12 # Forecast horizon
# Silence convergence warnings during the repeated fits; the saved level is
# restored after the loop (the original saved defaultW but never restored it).
defaultW <- getOption("warn")
options(warn = -1)
st <- tsp(chi_median)[1]+(k-2)/p # gives the start time in time units
error_AR1 <- matrix(NA,n-k,H) # One-year forecast horizon error for each window
AICc_AR1 <- matrix(NA,n-k) # Estimated model AICc value for each window
for(i in seq_len(n-k))
{
  # rolling window of length k, followed by an H-month evaluation window
  train <- window(chi_median, start=st+(i-k+1)/p, end=st+i/p) ## Window Length: k
  test <- window(chi_median, start=st + (i+1)/p, end=st+(i+H)/p) ## Window Length: H
  # Arima model refit on each window.
  # NOTE(review): auto.arima selected ARIMA(2,1,2)(0,1,2)[12] above, but this
  # refit uses a (0,1,1) seasonal order -- confirm which spec is intended.
  AR12 <- Arima(train, order=c(2,1,2), seasonal=list(order=c(0,1,1), period=p), lambda='auto', method='ML') #ARIMA(2,1,2)(0,1,1)[12]
  fcast_AR1 <- forecast(AR12, h=H)
  # Error & AICc per window
  error_AR1[i,1:length(test)] <- fcast_AR1[['mean']]-test
  AICc_AR1[i] <- AR12$aicc
}
options(warn = defaultW) # restore the user's warning level
MAE_AR1 <- colMeans(abs(error_AR1), na.rm=TRUE)
RMSE_AR1 <- sqrt(colMeans(error_AR1^2, na.rm=TRUE))
# Error by forecast horizon. Legend colours now match the plotted lines
# (the original legend used col=1:4, mislabelling both series), and the
# second label is corrected from "AR - RMSE" to "AR1 - RMSE".
plot(1:H, MAE_AR1, type="l",col=3,xlab="Horizon", ylab="Error", ylim=c(3600,8000))
lines(1:H, RMSE_AR1, type="l",col=2)
legend("topleft",legend=c("AR1 - MAE", "AR1 - RMSE"),col=c(3,2),lty=1)
# AICc per iteration; seq_len(n-k) replaces the hard-coded 1:79.
plot(seq_len(n-k), AICc_AR1, type="l",col=1,xlab="Iterations", ylab="AICc", ylim=c(750,2650))
legend("bottomright",legend="AR1 - AICc",col=1,lty=1)
As previously mentioned, the sArima model was not able to capture the changed environment due to COVID. Therefore, we will try Regression with Arima errors, in order to include other leading predictors.
We tried Google Trends data for two search terms: “homes for sale” and “realtor”.
First, we import the datasets and double-check dimensions.
# Import the two Google Trends predictor series and verify that they align
# with the 151-month target series and contain no missing values.
homes <- read_csv("home for sale.csv") #Google Trends "home for sale" Data
## Parsed with column specification:
## cols(
## Month = col_character(),
## `Home for sale` = col_double()
## )
realtor <- read_csv("realtor.csv") #Google Trends "realtor" Data
## Parsed with column specification:
## cols(
## Month = col_character(),
## Realtor = col_double()
## )
homes$Month <- yearmonth(homes$Month) # parse "YYYY-MM" strings to yearmonth
realtor$Month <- yearmonth(realtor$Month)
dim(homes)
## [1] 151 2
dim(realtor)
## [1] 151 2
sum(is.na(homes))
## [1] 0
sum(is.na(realtor))
## [1] 0
Store as TS object
# Monthly ts objects aligned with chi_median (both start February 2008).
homes_ts <- ts(homes$`Home for sale`, frequency=12, start=c(2008,2))
realtor_ts <- ts(realtor$Realtor, frequency=12, start=c(2008,2))
Visualize the median sale price (scaled) dataset and the two Google Trends datasets. They show similar seasonality, suggesting that they might be appropriate predictors.
# Scale prices to thousands so all three series fit on one axis for comparison.
chi_median_scale <- chi_median/1000
tsplot(cbind(chi_median_scale, homes_ts, realtor_ts), plot_title="Trends")
Train/test split
# Same September-2019 split as the target series (2019.6 lies between the
# August and September monthly time points, so no observation is shared).
home_train<-window(homes_ts, end = 2019.6) #end train in August 2019
home_test<-window(homes_ts, start = 2019.6) #use last year of dataset as test
realtor_train<-window(realtor_ts, end = 2019.6) #end train in August 2019
realtor_test<-window(realtor_ts, start = 2019.6) #use last year of dataset as test
Transform realtor data with a lambda value of 0.3 prior to fitting the regression model.
# Box-Cox transform the realtor predictor with its own estimated lambda.
lambda_realtor <- BoxCox.lambda(realtor_train) # lambda = 0.3, significantly different from 1
transformed_realtor <- BoxCox(realtor_train, lambda=lambda_realtor) # BoxCox Transformation
The regression model using the transformed_realtor dataset does not pass the Ljung-Box test: significant autocorrelation remains in the residuals.
# Regression with ARIMA errors using the transformed "realtor" series as the
# sole external regressor. Residuals fail Ljung-Box (p = 2.4e-08).
RAR1 <- auto.arima(y=chi_train, lambda=lambda, xreg=transformed_realtor) #auto.arima model
summary(RAR1) # b) summary
## Series: chi_train
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Box Cox transformation: lambda= 2
##
## Coefficients:
## xreg
## 276301056
## s.e. 146286849
##
## sigma^2 estimated as 1.354e+18: log likelihood=-2808.52
## AIC=5621.04 AICc=5621.14 BIC=5626.71
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 524.6034 5691.497 4194.314 0.2686517 2.123857 0.3339045 -0.3945342
checkresiduals(RAR1) # c) check residuals
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 80.737, df = 23, p-value = 2.412e-08
##
## Model df: 1. Total lags used: 24
The “home for sale” dataset did not need to be transformed.
# Lambda for "home for sale" is close to 1, so no transformation is applied.
BoxCox.lambda(home_train) # lambda = 0.9, similar to 1 so won't transform
## [1] 0.9184948
Similar to the other regression model, there remains significant autocorrelation in the residuals for this model as well.
# Regression with ARIMA errors using the untransformed "home for sale"
# series as the regressor. Residuals also fail Ljung-Box (p = 3.0e-07).
RAR2 <- auto.arima(y=chi_train, lambda=lambda, xreg=home_train) #auto.arima model
summary(RAR2) # b) summary
## Series: chi_train
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Box Cox transformation: lambda= 2
##
## Coefficients:
## xreg
## 18125128
## s.e. 10579886
##
## sigma^2 estimated as 1.334e+18: log likelihood=-2807.56
## AIC=5619.12 AICc=5619.22 BIC=5624.79
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 527.7724 5658.816 4153.264 0.2705263 2.107745 0.3306366 -0.3652482
checkresiduals(RAR2) # c) check residuals
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 73.9, df = 23, p-value = 2.996e-07
##
## Model df: 1. Total lags used: 24
Save as TS object, do train/test split
# Combine both Google Trends series into a two-column regressor matrix and
# apply the same September-2019 train/test split.
gtrends <- cbind(homes, realtor$Realtor)
gtrends_ts <- ts(gtrends, frequency=12, start=c(2008,2))
# keep only the two numeric predictor columns (drops the Month column);
# note the second column inherits the awkward name 'realtor$Realtor'
gtrends_ts <- cbind(gtrends_ts[,'Home for sale'], gtrends_ts[,'realtor$Realtor'])
gtrends_train<-window(gtrends_ts, end = 2019.6) #end train in August 2019
gtrends_test<-window(gtrends_ts, start = 2019.6) #use last year of dataset as test
As with other regression models, there remains significant autocorrelation in the residuals. Will experiment with other predictors later time permitting.
# Regression with ARIMA errors using BOTH Google Trends predictors.
# Residuals again fail Ljung-Box (p = 1.1e-07).
RAR3 <- auto.arima(y=chi_train, lambda=lambda, xreg=gtrends_train) #auto.arima model
summary(RAR3) # b) summary
## Series: chi_train
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Box Cox transformation: lambda= 2
##
## Coefficients:
## gtrends_ts[, "Home for sale"] gtrends_ts[, "realtor$Realtor"]
## 17242596 7887560
## s.e. 10260298 13730589
##
## sigma^2 estimated as 1.342e+18: log likelihood=-2807.41
## AIC=5620.83 AICc=5621.03 BIC=5629.34
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 523.5547 5648.847 4145.484 0.2681766 2.10269 0.3300172 -0.3662402
checkresiduals(RAR3) # c) check residuals
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 74.904, df = 22, p-value = 1.095e-07
##
## Model df: 2. Total lags used: 24
Recall that the periodogram points to two major spikes.
# Tabulate the raw periodogram of the training series: spectral power
# (scaled by 1e5, rounded) alongside each frequency in cycles per month.
temp <- periodogram(chi_train)
spec_p <- matrix(round(temp$spec/1e5, 2))
freq_p <- matrix(temp$freq)
cbind(spec_p, freq_p) # two major spikes at 0.006944 (= 1/144) and 0.083333 (= 1/12), per the output below
## [,1] [,2]
## [1,] 572397.76 0.006944444
## [2,] 40650.63 0.013888889
## [3,] 2355.93 0.020833333
## [4,] 7906.67 0.027777778
## [5,] 8553.56 0.034722222
## [6,] 5616.07 0.041666667
## [7,] 6845.62 0.048611111
## [8,] 1028.18 0.055555556
## [9,] 4165.32 0.062500000
## [10,] 5444.90 0.069444444
## [11,] 2769.42 0.076388889
## [12,] 217995.33 0.083333333
## [13,] 20085.74 0.090277778
## [14,] 942.00 0.097222222
## [15,] 1369.46 0.104166667
## [16,] 405.27 0.111111111
## [17,] 2350.98 0.118055556
## [18,] 1266.73 0.125000000
## [19,] 455.99 0.131944444
## [20,] 1762.04 0.138888889
## [21,] 582.42 0.145833333
## [22,] 1400.40 0.152777778
## [23,] 186.61 0.159722222
## [24,] 6555.05 0.166666667
## [25,] 3283.36 0.173611111
## [26,] 102.17 0.180555556
## [27,] 113.86 0.187500000
## [28,] 393.28 0.194444444
## [29,] 23.64 0.201388889
## [30,] 41.94 0.208333333
## [31,] 13.99 0.215277778
## [32,] 136.43 0.222222222
## [33,] 485.21 0.229166667
## [34,] 306.49 0.236111111
## [35,] 228.94 0.243055556
## [36,] 4207.25 0.250000000
## [37,] 537.30 0.256944444
## [38,] 386.98 0.263888889
## [39,] 211.18 0.270833333
## [40,] 179.81 0.277777778
## [41,] 436.47 0.284722222
## [42,] 408.08 0.291666667
## [43,] 361.11 0.298611111
## [44,] 534.24 0.305555556
## [45,] 496.54 0.312500000
## [46,] 646.79 0.319444444
## [47,] 617.71 0.326388889
## [48,] 1392.31 0.333333333
## [49,] 235.67 0.340277778
## [50,] 280.66 0.347222222
## [51,] 832.83 0.354166667
## [52,] 844.08 0.361111111
## [53,] 140.30 0.368055556
## [54,] 25.36 0.375000000
## [55,] 370.71 0.381944444
## [56,] 12.25 0.388888889
## [57,] 215.70 0.395833333
## [58,] 366.16 0.402777778
## [59,] 258.74 0.409722222
## [60,] 624.84 0.416666667
## [61,] 155.58 0.423611111
## [62,] 100.53 0.430555556
## [63,] 409.91 0.437500000
## [64,] 155.03 0.444444444
## [65,] 290.05 0.451388889
## [66,] 11.59 0.458333333
## [67,] 295.37 0.465277778
## [68,] 739.64 0.472222222
## [69,] 349.04 0.479166667
## [70,] 20.73 0.486111111
## [71,] 290.61 0.493055556
## [72,] 1659.42 0.500000000
# Convert the two dominant periodogram frequencies (cycles per month) into
# periods in years. Bug fix: the original divided by 0.00625 and 0.08125,
# which are not the spike frequencies tabulated above (0.006944... = 1/144
# and 0.083333... = 1/12), giving misleading periods of 13.3 and 1.03 years.
(1/0.006944444)/12 # 1/144 cycles/month -> 12-year period (spans the whole sample; the long trend)
## [1] 12
(1/0.083333333)/12 # 1/12 cycles/month -> 1-year period (annual seasonality)
## [1] 1
We create a regression model using a Fourier transformation (K=2) as the predictor, and Arima errors. This results in an ARIMA(0,2,3)(1,0,0)[12] model. The residuals show no significant autocorrelation at the 0.05 level (Ljung-Box p = 0.061, though only marginally) and are approximately normally distributed, suggesting they resemble white noise.
# Dynamic harmonic regression: Fourier terms (K=2 harmonic pairs) as
# regressors with ARIMA errors. Selected errors: ARIMA(0,2,3)(1,0,0)[12].
# NOTE(review): auto.arima still fits a seasonal error model here; examples
# of this technique usually set seasonal=FALSE so the Fourier terms carry
# all seasonality -- confirm which is intended.
DHR1 <- auto.arima(chi_train, xreg=fourier(chi_train, K=2))
summary(DHR1)
## Series: chi_train
## Regression with ARIMA(0,2,3)(1,0,0)[12] errors
##
## Coefficients:
## ma1 ma2 ma3 sar1 S1-12 C1-12 S2-12
## -1.2948 0.5287 -0.2133 0.6168 1969.158 -15229.860 -2614.3153
## s.e. 0.0831 0.1542 0.1143 0.0702 2140.429 2149.604 980.9099
## C2-12
## 1346.945
## s.e. 977.806
##
## sigma^2 estimated as 24592403: log likelihood=-1359.98
## AIC=2737.96 AICc=2739.38 BIC=2764.24
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 629.7414 4777.36 3773.858 0.2944363 1.88085 0.3004325 -0.02342825
checkresiduals(DHR1) # Ljung-Box p = 0.061: marginally above the 0.05 threshold
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,2,3)(1,0,0)[12] errors
## Q* = 25.555, df = 16, p-value = 0.06062
##
## Model df: 8. Total lags used: 24
This model is less accurate than the sArima model used for prediction earlier, although both models struggle with the same thing: predicting values during the COVID period.
#model 2
# Forecast the hold-out year; fourier(..., h=12) generates the future
# Fourier regressors required by the regression model.
DHR1_fcast <- forecast(DHR1, h=12, xreg=fourier(chi_train, K=2, h=12))
plot(DHR1_fcast, chi_test)
tsplot(cbind(chi_test, DHR1_fcast$mean), plot_title="Actual vs. Predicted")
MAE(DHR1_fcast$mean, chi_test) # worse than the sArima model's 4971.849
## [1] 5604.526
RMSE(DHR1_fcast$mean, chi_test) # worse than the sArima model's 7592.068
## [1] 8589.457
Will come back to this
#arimax(y,order=c(1,0,1), xreg=data.frame(x=c(rep(0,200),1:200)))
#xreg=data.frame(x=c(rep(0,200),1:200)))